import numpy as np
import pandas as pd

# Seed NumPy's global RNG. df.sample() below is called without an explicit
# random_state, so this seed is what makes the shuffle reproducible.
np.random.seed(11)

# Number of reviews to take from each polarity file. This must stay an int:
# the split arithmetic further down is derived from it, so the negative
# reviews are stored under their own name instead of reusing `n`.
n = 3000
with open("/Users/yuwang/Desktop/PMethods/Binary/rt-polaritydata/rt-polaritydata/rt-polarity.pos", encoding='latin-1') as doc:
    pos_lines = doc.readlines()[:n]

with open("/Users/yuwang/Desktop/PMethods/Binary/rt-polaritydata/rt-polaritydata/rt-polarity.neg", encoding='latin-1') as doc:
    neg_lines = doc.readlines()[:n]

# One [text, label] row per review: label 1 for the .pos file, 0 for .neg.
data = [[text.strip(), 1] for text in pos_lines]
data.extend([text.strip(), 0] for text in neg_lines)

# Shuffle all rows and renumber the index so positional slices are contiguous.
df = pd.DataFrame(data, columns=["text", "label"])
df = df.sample(frac=1).reset_index(drop=True)

# Split boundaries (row positions), computed with integer division so they
# are valid slice bounds. Test and validation each get n // 3 rows; the
# remainder is training. Total rows -> test rows:
# 6000 -> 1000, 3000 -> 500, 1500 -> 250
# NOTE(review): assumes each input file has at least n lines, i.e. the frame
# has 2 * n rows; with shorter files the later slices simply come out smaller.
holdout = n // 3
test_start = 2 * n - holdout
dev_start = test_start - holdout

df[:test_start].to_csv("train_dev.csv")
df[test_start:].to_csv("test.csv")

# Report the number of positive examples in each split as a balance check.
train_positives = df[:dev_start]["label"].sum()
validation_positives = df[dev_start:test_start]["label"].sum()
test_positives = df[test_start:]["label"].sum()
print(train_positives, validation_positives, test_positives)
